In [30]:
import prov, requests, pandas as pd, io, git, datetime, urllib.parse
from prov.model import ProvDocument
In [31]:
pg = ProvDocument()
kn_id = "data/data-gov-au/number-of-properties-by-suburb-and-planning-zone-csv"
pg.add_namespace('kn', 'http://oznome.csiro.au/id/')
pg.add_namespace('void', 'http://vocab.deri.ie/void#')
pg.add_namespace('foaf', 'http://xmlns.com/foaf/0.1/')
pg.add_namespace('dc', 'http://purl.org/dc/elements/1.1/')
pg.add_namespace('doap', 'http://usefulinc.com/ns/doap#')
Out[31]:
Processing could be anything, and represents one or more provenance activities. In this example we use a KN metadata record to retrieve data on residential properties. We intersperse the definition of provenance with the processing, but we could just as easily have separated it out and performed it after the processing steps.
First, we define an entity that describes the KN metadata record we are using here.
In [32]:
input_identifier = 'kn:'+ kn_id
input_entity = pg.entity(input_identifier, {'prov:label': 'number of properties by suburb and planning zone', 'prov:type': 'void:Dataset'})
Then we drill down to retrieve the detailed data we've found associated with this record.
In [33]:
start_time = datetime.datetime.now()
In [34]:
response = requests.get('https://data.sa.gov.au/data/dataset/d080706c-2c05-433d-b84d-9aa9b6ccae73/resource/4a47e89b-4be8-430d-8926-13b180025ac6/download/city-of-onkaparinga---number-of-properties-by-suburb-and-planning-zone-2016.csv')
In [35]:
url_data = response.content
In [36]:
dataframe = pd.read_csv(io.StringIO(url_data.decode('utf-8')))
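If the request fails, pandas may end up trying to parse an HTML error page. A minimal guard, using requests' raise_for_status, might look like this:

response.raise_for_status()  # raise immediately if the HTTP request failed
dataframe = pd.read_csv(io.StringIO(response.content.decode('utf-8')))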
In [37]:
dataframe.columns
Out[37]:
Our processing is very simple: we subset the original dataset to create a new dataset, residential_frame, which we then save to disk.
In [38]:
residential_frame = dataframe[dataframe['Zone_Description'] == 'Residential']
In [39]:
residential_frame_file_name = "filtered_residential_data.csv"
residential_frame.to_csv(residential_frame_file_name)
end_time = datetime.datetime.now()
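As an aside, the recorded interval is unambiguous across machines if the timestamps are timezone-aware; a minimal sketch of the same bookkeeping in UTC (note that aware datetimes pair with .timestamp() rather than time.mktime further down):

start_time = datetime.datetime.now(datetime.timezone.utc)  # timezone-aware start time
# ... processing ...
end_time = datetime.datetime.now(datetime.timezone.utc)    # timezone-aware end time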
Ideally we would store our output provenance entity somewhere known and persistent, and identify it with a persistent URL. However, we can still mint an identifier and describe the dataset in ways that make it easy to find and query later. To do this we create a new entity record, using the file name and the SHA-1 hash of the file to describe it.
In [40]:
import subprocess
output = subprocess.check_output("sha1sum "+ residential_frame_file_name, shell=True)
In [41]:
# sha1sum prints "<digest>  <filename>"; keep just the hex digest
sha1 = output.decode('utf-8').split(' ')[0]
In [42]:
output_identifier = 'kn:' + sha1
output_entity = pg.entity(output_identifier , {'prov:label': residential_frame_file_name, 'prov:type': 'void:Dataset'})
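Shelling out to sha1sum only works where that binary exists; a portable sketch of the same digest computation, using Python's built-in hashlib:

import hashlib

def file_sha1(path, chunk_size=8192):
    # stream the file in chunks so large outputs don't need to fit in memory
    h = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

sha1 = file_sha1(residential_frame_file_name)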
We need to connect the entity representing the input data to the entity representing the output data, and we may also want to describe the activity that transforms the input into the output. In this case the activity is this Jupyter notebook itself. One way to store provenance information about it is to make sure the notebook is version controlled in git and then record those details. To locate the notebook we first need the Jupyter server's port; the JavaScript cell below injects the page's port into the kernel as NB_Port.
In [43]:
import re, ipykernel, json
In [44]:
%%javascript
var nb = Jupyter.notebook;
var port = window.location.port;
nb.kernel.execute("NB_Port = '" + port + "'");
In [45]:
kernel_id = re.search('kernel-(.*).json', ipykernel.connect.get_connection_file()).group(1)
# note: the '/jupyter' prefix here reflects this server's base URL; adjust it for your deployment
response = requests.get('http://127.0.0.1:{port}/jupyter/api/sessions'.format(port=NB_Port))
matching = [s for s in json.loads(response.text) if s['kernel']['id'] == kernel_id]
if matching:
    matched = matching[0]['notebook']['path']
In [46]:
notebook_file_name = matched.split('/')[-1]
One gotcha here is that we need to make sure this notebook's current version has been committed and pushed to the remote. So do that, and then execute these cells.
In [47]:
repo = git.Repo('./', search_parent_directories=True)
current_git_sha = repo.head.object.hexsha
current_git_remote = list(repo.remotes['origin'].urls)[0]
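It is easy to forget that commit; a small guard using GitPython's is_dirty can flag the problem before the provenance is recorded:

# warn if uncommitted changes mean the recorded SHA won't match this notebook
if repo.is_dirty(untracked_files=True):
    print('Warning: working tree has uncommitted changes; commit and push before recording provenance.')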
In [48]:
current_git_sha
Out[48]:
In [49]:
current_git_remote
Out[49]:
In [50]:
process_identifier = 'kn:' + 'notebook/' + urllib.parse.quote(notebook_file_name + current_git_sha, safe='')
In [51]:
process_identifier
process_entity = pg.entity(process_identifier, other_attributes={'dc:description': 'a Jupyter notebook that demonstrates provenance', 'doap:GitRepository': current_git_remote, 'doap:Version': current_git_sha})
In [52]:
import time
# convert the start and end times to Unix timestamps so each run gets a distinct activity identifier
sunixtime = time.mktime(start_time.timetuple())
eunixtime = time.mktime(end_time.timetuple())
activity_identifier = 'kn:' + 'notebook/' + urllib.parse.quote(notebook_file_name + current_git_sha, safe='') + str(sunixtime) + str(eunixtime)
activity = pg.activity(activity_identifier, startTime=start_time, endTime=end_time)
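PROV can also record who ran the activity. A sketch attaching a hypothetical agent (the identifier and name below are placeholders; the foaf namespace was registered earlier):

# associate a (hypothetical) person with the activity
agent = pg.agent('kn:agents/example-user', {'prov:type': 'prov:Person', 'foaf:name': 'Example User'})
pg.wasAssociatedWith(activity, agent)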
In [53]:
pg.wasGeneratedBy(activity=activity, entity=output_entity)
Out[53]:
In [54]:
pg.used(activity=activity, entity=input_entity)
Out[54]:
In [55]:
pg.used(activity=activity, entity=process_entity)
Out[55]:
In [56]:
pg
Out[56]:
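Beyond displaying the document inline, it can be persisted for later querying; the prov package serializes to PROV-JSON by default, and get_provn gives a human-readable view:

pg.serialize('prov_output.json')  # write the document as PROV-JSON
print(pg.get_provn())             # or inspect it in PROV-N notation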
In [57]:
# visualize the graph
from prov.dot import prov_to_dot
dot = prov_to_dot(pg)
dot.write_png('prov.png')
Out[57]:
In [58]:
from IPython.display import Image
Image('prov.png')
Out[58]: